# encoding: utf-8
# @Time    : 2024/12/22 15:14

import json
import pandas as pd

# Parquet inputs and the JSON Lines files derived from them.
train_file = "./data/train-00000-of-00001-789dc5dece0f1fc1.parquet"
test_file = "./data/test-00000-of-00001-8ecd46436fadcf7f.parquet"
train_json = "./data/train.json"
test_json = "./data/test.json"

# Only the test split is loaded by this script.
df = pd.read_parquet(test_file)

# Preview the first rows, then the column index.
# Recorded column output for this dataset:
# Index(['prompt', 'chosen', 'rejected'], dtype='object')
for preview in (df.head(), df.columns):
    print(preview)

# One dict per row, keyed by column name.
datas = df.to_dict(orient="records")

def _as_text(value):
    """Return ``value`` stripped of surrounding whitespace, or ``""``.

    ``dict.get(key, "")`` only falls back to the default when the key is
    absent; a row whose cell is null still has the key, with a non-string
    value (e.g. ``None`` or NaN), and calling ``.strip()`` on that raises
    ``AttributeError``. Treating every non-string as empty lets the
    completeness check below skip such rows instead of crashing.
    """
    return value.strip() if isinstance(value, str) else ""


# Build the list of records to export: each keeps the prompt as
# "instruction", the chosen answer as "output" and the rejected answer as
# "reject", with an always-empty "input" field.
result = []
for one in datas:
    instruction = _as_text(one.get("prompt"))
    output = _as_text(one.get("chosen"))
    reject = _as_text(one.get("rejected"))

    # Drop rows where any of the three fields is missing or blank.
    if not (instruction and output and reject):
        continue

    result.append(
        {
            "instruction": instruction,
            "input": "",
            "output": output,
            "reject": reject,
        }
    )

# Serialize every kept record as one JSON object per line (JSON Lines),
# leaving non-ASCII characters unescaped in the UTF-8 output.
with open(test_json, "w", encoding="utf-8") as out:
    out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in result
    )




# No-op entry point: the conversion is written as module-level statements,
# so it has already run by the time this guard is reached (including when
# the module is imported rather than executed as a script).
if __name__ == "__main__":
    pass
